필요한 library

#install.packages(c("ggthemes", "magrittr", "data.table", "stringr", "rvest", "janitor", "lubridate", "ggmap", "gganimate", "leaflet", "leaflet.extras"))
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(magrittr)
library(data.table)
library(stringr)
# Fetching
library(rvest)
# Cleaning column names
library(janitor)
# Date/Time formatting
library(lubridate)
# Maps
library(ggmap)
library(sf)
# Used for animated density plots
library(gganimate)
# Only needed for interactive maps
library(leaflet)
library(leaflet.extras)
library(readxl)
library(gridExtra)
library(RColorBrewer)
library(googleway)

자료 불러오기

setwd("C:/Users/User/Desktop/대학원/3학기/통계 그래픽스/과제/HW4")
rides2015<-rbind(read.csv("Divvy_Trips_2015-Q1Q2/Divvy_Trips_2015-Q1.csv"),
                 read.csv("Divvy_Trips_2015-Q1Q2/Divvy_Trips_2015-Q2.csv"),
                 read.csv("Divvy_Trips_2015_Q3Q4/Divvy_Trips_2015_07.csv"),
                 read.csv("Divvy_Trips_2015_Q3Q4/Divvy_Trips_2015_08.csv"),
                 read.csv("Divvy_Trips_2015_Q3Q4/Divvy_Trips_2015_09.csv"),
                 read.csv("Divvy_Trips_2015_Q3Q4/Divvy_Trips_2015_Q4.csv"))
station2015<-read.csv("Divvy_Trips_2015-Q1Q2/Divvy_Stations_2015.csv")
weather2015 <- read.csv("C:/Users/User/Desktop/대학원/3학기/통계 그래픽스/과제/HW5/725340-14819-2015/725340-14819-2015.csv", sep="")

자료 변환

  1. rides
# Age
rides2015$age <- 2015-rides2015$birthyear
c(min(rides2015$age, na.rm=T), max(rides2015$age, na.rm=T))
## [1]  16 117
rides2015$age_bin <- rides2015$age %>% .bincode(seq(0,120,20))
rides2015$age_bin <- sapply(rides2015$age_bin,function(bin) {
  return(paste0((bin-1)*20,"-",(bin*20)," Years Old"))
})
rides2015$age_bin<-factor(rides2015$age_bin,levels=c("0-20 Years Old","20-40 Years Old","40-60 Years Old","60-80 Years Old","80-120 Years Old","NA-NA Years Old"))

# Time
rides2015$starttime<-strptime(as.character(rides2015$starttime),format="%m/%d/%Y %H:%M")
rides2015$stoptime<-strptime(as.character(rides2015$stoptime),format="%m/%d/%Y %H:%M")
rides2015$starttime <- as.POSIXct(rides2015$starttime)
rides2015$stoptime <- as.POSIXct(rides2015$stoptime)

## t시간 변수 만들기 
# Trip times
rides2015$minutes <- rides2015$tripduration/60
rides2015$hours <- rides2015$tripduration/60/60
# Start times
rides2015$start_hour <- lubridate::hour(rides2015$starttime)
rides2015$mm <- hour(rides2015$starttime)*60 + minute(rides2015$starttime)
rides2015$start_day <- wday(rides2015$starttime,label =  T, abbr = F, week_start = 1)
# Weekend/Weekday
rides2015$start_day_type <- ifelse(wday(rides2015$starttime, week_start = 1)>5, "Weekend", "Weekday")
# Day of Year
rides2015$day <- yday(rides2015$starttime)
# Week of year
rides2015$week <- week(rides2015$starttime)
# Month (1-12)
rides2015$month <- month(rides2015$starttime,label = T,abbr = F)
# Month (January-December)
rides2015$month_text <- month(rides2015$starttime,label = T,abbr = F)
# Remove unused levels from factor
rides2015$month_text <- droplevels(rides2015$month_text)
rides2015
  1. station - join with rides2015
station2015 <- station2015 %>% select(-name)
rides2015 <- rides2015 %>%
  left_join(station2015,by=c("from_station_id"="id"))
colnames(rides2015)[25:28]<-c("from_station_latitude","from_station_longitude","from_dpcapacity", "from_landmark")
rides2015 <- rides2015 %>%
  left_join(station2015,by=c("to_station_id"="id"))
colnames(rides2015)[29:32]<-c("to_station_latitude","to_station_longitude","to_dpcapacity","to_landmark")

rides2015
  1. weather
for (i in 1:ncol(weather2015)){
  weather2015[,i] <- ifelse(weather2015[,i]==-9999, NA, weather2015[,i])
}
colnames(weather2015) <- c("year", "month", "day", "hour", "temp", "dew_temp", "slp", "wd", "wsr", "sctcc", "per", "per_six")
summary(weather2015)
##       year          month             day             hour      
##  Min.   :2015   Min.   : 1.000   Min.   : 1.00   Min.   : 0.00  
##  1st Qu.:2015   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 6.00  
##  Median :2015   Median : 7.000   Median :16.00   Median :12.00  
##  Mean   :2015   Mean   : 6.527   Mean   :15.72   Mean   :11.50  
##  3rd Qu.:2015   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:17.75  
##  Max.   :2015   Max.   :12.000   Max.   :31.00   Max.   :23.00  
##                                                                 
##       temp           dew_temp            slp              wd       
##  Min.   :-211.0   Min.   :-272.00   Min.   : 9867   Min.   :  0.0  
##  1st Qu.:  33.0   1st Qu.: -22.00   1st Qu.:10129   1st Qu.: 80.0  
##  Median : 128.0   Median :  50.00   Median :10169   Median :200.0  
##  Mean   : 116.6   Mean   :  46.22   Mean   :10176   Mean   :176.6  
##  3rd Qu.: 211.0   3rd Qu.: 133.00   3rd Qu.:10224   3rd Qu.:250.0  
##  Max.   : 344.0   Max.   : 256.00   Max.   :10448   Max.   :360.0  
##                                     NA's   :34      NA's   :131    
##       wsr             sctcc            per             per_six      
##  Min.   :  0.00   Min.   :0.000   Min.   : -1.000   Min.   : -1.00  
##  1st Qu.: 31.00   1st Qu.:2.000   1st Qu.:  0.000   1st Qu.: -1.00  
##  Median : 41.00   Median :2.000   Median :  0.000   Median :  3.00  
##  Mean   : 44.65   Mean   :3.221   Mean   :  0.988   Mean   : 21.02  
##  3rd Qu.: 57.00   3rd Qu.:4.000   3rd Qu.:  0.000   3rd Qu.: 23.00  
##  Max.   :154.00   Max.   :9.000   Max.   :269.000   Max.   :325.00  
##  NA's   :1        NA's   :4953    NA's   :24        NA's   :8386
# NA 많은 열 제거
weather2015 <- weather2015[,-c(10,12)]
# 시간 변수들 합치기
weather2015 <- weather2015 %>% 
  unite(time, year:hour, sep = "-")
weather2015$time<-strptime(as.character(weather2015$time),format="%Y-%m-%d-%H")
weather2015$time <- as.POSIXct(weather2015$time)
weather2015$month <- month(weather2015$time,label = T,abbr = F)
weather2015[,c(2,3,6)] <- weather2015[,c(2,3,6)]/10
weather2015

Distribution

ggplot(rides2015) + geom_histogram(aes(age)) +
  theme_bw() + ggtitle("2015 Trip duration distribution") +
  geom_vline(xintercept=80, col=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 929855 rows containing non-finite values (stat_bin).

ggplot(rides2015) + geom_histogram(aes(tripduration)) +
  theme_bw() + ggtitle("2015 Trip duration distribution") +
  geom_vline(xintercept=4000, col=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(rides2015 %>% filter(tripduration<=4000)) +
  geom_histogram(aes(tripduration)) + theme_bw() +
  ggtitle("2015 Trip duration distribution less than 4000")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

rides2015 %>% filter(tripduration>4000) %>% group_by(usertype) %>% summarise(n=n())

자전거를 빌린 기간이 긴 usertype은 Customer 그룹이 Subscriber보다 많다.

rides2015 %>% group_by(usertype) %>% summarise(n=n())
rides2015 <- rides2015 %>% filter(usertype!="Dependent")

Dependent 수가 적기 때문에 제외하였다.

Age of riders

ggplot(data=rides2015 %>% filter(age<=80), 
       aes(x=week, fill=month_text)) + 
  geom_histogram(alpha=.9, binwidth=1) + 
  theme_fivethirtyeight() + 
  ggtitle("2015 Ride Frequency") +
  facet_grid(.~age_bin,scale="free") +
  scale_fill_viridis_d()

## Usertype ~ Age
ggplot(data=rides2015 %>% filter(age<=80), 
       aes(x=week, fill=month_text)) + 
  geom_histogram(alpha=.9, bins=53) + 
  theme_fivethirtyeight() + 
  ggtitle("2015 Ride Frequency") + facet_grid(usertype~age_bin,scale="free") + 
  scale_fill_viridis_d()

## Usertype & start_day_type & Age
ggplot(rides2015 %>% filter(age<=80),
       aes(x=mm, fill= age_bin)) +
  geom_density(alpha=.6) +
  scale_x_continuous(labels = c("5am","8am","1:30pm","5pm","8pm"),
                     breaks = c(300,480,750,1020,1200)) + 
  labs(fill="",title="NiceRide 2015 Start Times") + 
  theme_fivethirtyeight() +
  theme(strip.background = element_rect(fill = "#FFFFFF")) +
  scale_fill_viridis_d(option="A") +
  facet_grid(usertype~start_day_type)
## Warning: Groups with fewer than two data points have been dropped.

Each day

rides2015 %>% mutate(day1=day(starttime)) %>% filter(day1==1) %>% 
  group_by(month) %>%
  select(day1, day, month) %>% summarise(first(day))
ggplot() +
  geom_area(data=rides2015, 
            aes(x=day, y=..count.., fill=usertype),
            stat="bin", alpha=0.6) +
  scale_fill_viridis_d() +
  scale_x_continuous(labels = c("Jan","Feb","Mar","Apr","May","June","July","Aug","Sep",
                                "Oct","Nov","Dec"),
                     breaks = c(1,32,60,91,121,152,182,213,244,274,305,335)) +
  labs(fill="",title="2015 Trips on each day") +
  theme_fivethirtyeight() +
  facet_grid(usertype~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Trips on each weekday

ggplot() + 
  geom_histogram(data=rides2015, 
                 aes(x=start_day, fill=usertype), 
                 stat="count", alpha=0.6) +
  facet_grid(usertype~., scales="free") + 
  labs(fill="",title="2015 Trips on each weekday") +
  theme_fivethirtyeight()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

heatmap

rides2015_naomit <- na.omit(rides2015)
df.heatmap.weekday <- list()
df.heatmap.weekday$weekday <- rides2015_naomit %>%
  filter(start_day_type=="Weekday") %>%
  group_by(from_station_longitude, from_station_latitude) %>%
  summarize(intensity = sqrt(n()))
names(df.heatmap.weekday$weekday)[1:2] <- c("longitude","latitude")

df.heatmap.weekday$weekend <- rides2015_naomit %>% 
  filter(start_day_type=="Weekend") %>%
  group_by(from_station_longitude, from_station_latitude) %>%
  summarize(intensity = sqrt(n()))
names(df.heatmap.weekday$weekend)[1:2] <- c("longitude","latitude")

df.heatmap.weekday$weekday$pos <- "weekday"
df.heatmap.weekday$weekend$pos <- "weekend"

df.heatmap.weekday %<>% rbindlist(fill = T)
leaflet(df.heatmap.weekday) %>% 
  addProviderTiles(providers$CartoDB.DarkMatter) %>%
  addHeatmap(data = df.heatmap.weekday %>% filter(pos=="weekday"),
             lng=~longitude, 
             lat=~latitude, 
             intensity = ~intensity,
             blur = 10, 
             max = 100, radius = 15,
             layerId = "weekday", group = "weekday") %>%
  addHeatmap(data = df.heatmap.weekday %>% filter(pos=="weekend"),
             lng=~longitude, 
             lat=~latitude, 
             intensity = ~intensity,
             blur = 10, 
             max = 100, radius = 15,
             layerId = "weekend", group = "weekend") %>%
  addLayersControl(
    baseGroups = c("weekday", "weekend"),
    options = layersControlOptions(collapsed = FALSE)
  )
  • 주말의 면적이 주중의 면적보다 작다.

주말 새벽

rides2015_am <- rides2015 %>% filter(mm<=300 & start_day_type=="Weekend")
rides2015_am %>% group_by(usertype, gender) %>% summarise(n=n())
# Subscriber만
ggplot() +
  geom_density(data=rides2015_am %>% 
                 filter(usertype=="Subscriber"), 
                 aes(x=mm, fill="Total"), stat="count", alpha=0.6) +
  geom_density(data=rides2015_am %>% 
                 filter(usertype=="Subscriber"&gender=="Male"), 
                 aes(x=mm, fill="Male"),  stat="count", alpha=0.6) +
  geom_density(data=rides2015_am %>% 
                 filter(usertype=="Subscriber"&gender=="Female"), 
                 aes(x=mm, fill="Female"),  stat="count", alpha=0.6) + 
  scale_fill_viridis_d(option="A") +
  scale_x_continuous(labels = c("1am", "2am", "3am", "4am", "5am"),
                     breaks = c(60, 120, 180, 240, 300)) +
  labs(fill="",title="Trips on Weekend(Subscriber)") +
  theme_fivethirtyeight()

대부분 남자가 빌렸다는 것을 알 수 있다.

ggplot(rides2015_am) +
  geom_boxplot(aes(x=month, y=mm)) + 
  labs(title="Trips on Weekend") +
  theme_fivethirtyeight()

월과는 상관이 없어 보인다.

ggplot(rides2015_am %>% filter(age<=80)) + 
  geom_histogram(aes(x=start_hour, fill=age_bin), stat="count", position="dodge") +
  labs(x="",fill="",title="Trips on Weekend") +
  scale_x_discrete(
    breaks = c(0,1,2,3,4,5),
    labels=c("12~1am","1~2am","2~3am","3~4am","4~5am","5~6am")) +
  theme_fivethirtyeight() +
  scale_fill_viridis_d()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

대부분 20~40대

rides2015_am %>% group_by(usertype) %>% summarise(n=n())

전체 데이터에서는 Subscriber 그룹이 훨씬 많았지만 주말 새벽에는 거의 비슷하다.

Top Stations

rides2015 %>%
  group_by(from_station_name) %>%
  summarize(freq = n()) %>%
  top_n(10) %>%
  ggplot(aes(reorder(from_station_name, freq), y = freq, fill = freq)) +   
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Pick-up Station") +
  ylab("Frequency") +
  coord_flip() +
  theme_fivethirtyeight() + 
  theme(legend.position ='none', axis.text.y = element_text(size = 12)) + 
  scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral'))) +
  ggtitle("Top 10 Pick-up Stations")
## Selecting by freq

rides2015 %>%
  group_by(to_station_name) %>%
  summarize(freq = n()) %>%
  top_n(10) %>%
  ggplot(aes(reorder(to_station_name, freq), y = freq, fill = freq)) +   
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Pick-up Station") +
  ylab("Frequency") +
  coord_flip() +
  theme_fivethirtyeight() + 
  theme(legend.position ='none', axis.text.y = element_text(size = 12)) + 
  scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral'))) +
  ggtitle("Top 10 Put-down Stations")
## Selecting by freq

rides2015_naomit <- na.omit(rides2015)
df.lines2015 <- rides2015_naomit %>%
  group_by(from_station_longitude,
           from_station_latitude,
           to_station_longitude,
           to_station_latitude,
           from_station_name,
           to_station_name) %>%
  summarize(rides = n())
highlights <- arrange(df.lines2015, desc(rides))[1:10,]
highlights
mpls2015 <- get_map(c(left = min(rides2015_naomit$from_station_longitude), 
                  bottom = min(rides2015_naomit$from_station_latitude), 
                  right = max(rides2015_naomit$from_station_longitude), 
                  top = max(rides2015_naomit$from_station_latitude)),
                maptype='terrain', source='stamen', zoom=13)
## 45 tiles needed, this may take a while (try a smaller zoom).
## Source : http://tile.stamen.com/terrain/13/2099/3040.png
## Source : http://tile.stamen.com/terrain/13/2100/3040.png
## Source : http://tile.stamen.com/terrain/13/2101/3040.png
## Source : http://tile.stamen.com/terrain/13/2102/3040.png
## Source : http://tile.stamen.com/terrain/13/2103/3040.png
## Source : http://tile.stamen.com/terrain/13/2099/3041.png
## Source : http://tile.stamen.com/terrain/13/2100/3041.png
## Source : http://tile.stamen.com/terrain/13/2101/3041.png
## Source : http://tile.stamen.com/terrain/13/2102/3041.png
## Source : http://tile.stamen.com/terrain/13/2103/3041.png
## Source : http://tile.stamen.com/terrain/13/2099/3042.png
## Source : http://tile.stamen.com/terrain/13/2100/3042.png
## Source : http://tile.stamen.com/terrain/13/2101/3042.png
## Source : http://tile.stamen.com/terrain/13/2102/3042.png
## Source : http://tile.stamen.com/terrain/13/2103/3042.png
## Source : http://tile.stamen.com/terrain/13/2099/3043.png
## Source : http://tile.stamen.com/terrain/13/2100/3043.png
## Source : http://tile.stamen.com/terrain/13/2101/3043.png
## Source : http://tile.stamen.com/terrain/13/2102/3043.png
## Source : http://tile.stamen.com/terrain/13/2103/3043.png
## Source : http://tile.stamen.com/terrain/13/2099/3044.png
## Source : http://tile.stamen.com/terrain/13/2100/3044.png
## Source : http://tile.stamen.com/terrain/13/2101/3044.png
## Source : http://tile.stamen.com/terrain/13/2102/3044.png
## Source : http://tile.stamen.com/terrain/13/2103/3044.png
## Source : http://tile.stamen.com/terrain/13/2099/3045.png
## Source : http://tile.stamen.com/terrain/13/2100/3045.png
## Source : http://tile.stamen.com/terrain/13/2101/3045.png
## Source : http://tile.stamen.com/terrain/13/2102/3045.png
## Source : http://tile.stamen.com/terrain/13/2103/3045.png
## Source : http://tile.stamen.com/terrain/13/2099/3046.png
## Source : http://tile.stamen.com/terrain/13/2100/3046.png
## Source : http://tile.stamen.com/terrain/13/2101/3046.png
## Source : http://tile.stamen.com/terrain/13/2102/3046.png
## Source : http://tile.stamen.com/terrain/13/2103/3046.png
## Source : http://tile.stamen.com/terrain/13/2099/3047.png
## Source : http://tile.stamen.com/terrain/13/2100/3047.png
## Source : http://tile.stamen.com/terrain/13/2101/3047.png
## Source : http://tile.stamen.com/terrain/13/2102/3047.png
## Source : http://tile.stamen.com/terrain/13/2103/3047.png
## Source : http://tile.stamen.com/terrain/13/2099/3048.png
## Source : http://tile.stamen.com/terrain/13/2100/3048.png
## Source : http://tile.stamen.com/terrain/13/2101/3048.png
## Source : http://tile.stamen.com/terrain/13/2102/3048.png
## Source : http://tile.stamen.com/terrain/13/2103/3048.png
ggmap(mpls2015,darken = c(.8,"#FFFFFF")) + 
  geom_segment(data = df.lines2015,
               aes(x = from_station_longitude, 
                   y = from_station_latitude,
                   xend = to_station_longitude,
                   yend = to_station_latitude,
                   alpha = sqrt(rides)),
               color = "#000000") + coord_cartesian() +
  scale_alpha(range = c(0.0001, .5)) +
  geom_point(data = df.lines2015 %>% 
               group_by(longitude = from_station_longitude,
                        latitude = from_station_latitude) %>%
               summarize(rides = sum(rides)),
             aes(x = longitude, 
                 y = latitude,
                 size = rides),
             color="#009900",alpha=.4) + 
  geom_point(data = highlights,
             aes(x = from_station_longitude, 
                 y = from_station_latitude,
                 size = rides*1.5),
             color="red",alpha=.4) +
  scale_size_continuous(range(4,100)) +
  scale_color_viridis_c() + 
  scale_fill_viridis_c() + 
  theme_nothing()
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

rides2015_am_naomit <- na.omit(rides2015_am)
df.lines2015_am <- rides2015_am_naomit %>%
  group_by(from_station_longitude,
           from_station_latitude,
           to_station_longitude,
           to_station_latitude,
           from_station_name,
           to_station_name) %>%
  summarize(rides = n())
df.lines2015_am$region <- 1:nrow(df.lines2015_am)
df.lines2015_am_from <- df.lines2015_am[,c(1,2,5,7,8)]
colnames(df.lines2015_am_from) <- c("longitude", "latitude","name", "rides", "region")
df.lines2015_am_to <- df.lines2015_am[,c(3,4,6,7,8)]
colnames(df.lines2015_am_to) <- c("longitude", "latitude", "name", "rides", "region")
df.lines2015_am2 <- rbind(df.lines2015_am_from, df.lines2015_am_to)
df.lines2015_am2$region <- as.factor(df.lines2015_am2$region)
highlights <- arrange(df.lines2015_am, desc(rides))[1:10,]
highlights
highlights2 <- arrange(df.lines2015_am2, desc(rides))[1:20,]

map <- leaflet() %>%
  addTiles() %>% 
  setView(median(highlights2$longitude), 
          median(highlights2$latitude),
          zoom = 11) %>%
  addMarkers(data = highlights, 
                    lng = ~from_station_longitude,
             lat = ~from_station_latitude, 
             popup = ~paste(paste("from:", from_station_name),
                            paste("rides:", rides),sep="<br/>")) %>%
  addAwesomeMarkers(data = highlights, lng = ~to_station_longitude,
             lat = ~to_station_latitude, 
             popup = ~paste(paste("to:", to_station_name),
                            paste("rides:", rides),sep="<br/>"),
             icon = awesomeIcons(markerColor="green")) %>%
  addCircleMarkers(data=df.lines2015_am2, lng = ~longitude, lat = ~latitude, opacity = 0.5, radius = ~rides/2, popup = ~rides)
for(group in levels(highlights2$region)){
  map = addPolylines(map, 
                      lng= ~ longitude,
                      lat= ~ latitude,
                      data = highlights2[highlights2$region == group,], color="red")
}
map

Trip Duration

ggplot(rides2015 %>% filter(age<=80&tripduration<=4000)) + 
  geom_density(aes(tripduration, fill=age_bin), alpha=0.6) +
  facet_grid(usertype~start_day_type) + 
  theme_fivethirtyeight() + 
  labs(fill="", title="2015 Trip Duration") +
  scale_fill_viridis_d(option="A") +
  scale_x_continuous(
    labels = c("5 minutes","10 minutes","20 minutes","30 minutes","50 miutes"),
    breaks = c(300, 600, 1200, 1800, 3000)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Groups with fewer than two data points have been dropped.

ggplot(rides2015 %>% filter(age<=80&tripduration>4000)) + 
  geom_density(aes(tripduration, fill=age_bin), alpha=0.6) +
  facet_grid(usertype~start_day_type) + 
  theme_fivethirtyeight() + 
  labs(fill="", title="2015 Trip Duration(after 1hour)") +
  scale_fill_viridis_d(option="A") +
  scale_x_continuous(
    labels = c("1hour","10 hour"),
    breaks = c(3600, 36000))

ggplot(rides2015 %>% filter(tripduration>4000)) + 
  geom_density(aes(tripduration, fill=usertype), alpha=0.6) +
  facet_grid(.~start_day_type) + 
  theme_fivethirtyeight() + 
  labs(fill="", title="2015 Trip Duration(after 1hour)") +
  scale_fill_viridis_d(option="A") +
  scale_x_continuous(
    labels = c("1hour","10 hour"),
    breaks = c(3600, 36000))

Customer & Weekend에서 trip duration이 높은 역을 찾아보자

top_dur <- rides2015 %>% 
  filter(usertype=="Customer" & start_day_type=="Weekend") %>% 
  group_by(from_station_name, to_station_name, from_landmark, to_landmark) %>% 
  summarise(mean_dur=mean(tripduration)) %>% 
  arrange(desc(mean_dur))
top_dur[1:10,]

Calender Heatmap

library(ggTimeSeries)
# Generate frequency table
df.cal <- rides2015$starttime %>% as_date() %>% table %>% data.frame
names(df.cal) <- c("Date","Rides")
df.cal$Date %<>% as_date

ggplot_calendar_heatmap(
  df.cal,
  'Date',
  'Rides'
) + theme_fivethirtyeight() + 
  theme(legend.position = "right",
        legend.direction = "vertical") + 
  scale_fill_viridis_c()

df_w <- weather2015
df_w$Date <- as_date(df_w$time)
df_w <- df_w %>% group_by(Date) %>%
  summarise("temperature" = mean(temp),
            "precipitation" = sum(per)) 

# Temperature  
ggplot_calendar_heatmap(
  df_w,
  'Date',
  'temperature'
) + theme_fivethirtyeight() + 
  theme(legend.position = "right",
        legend.direction = "vertical") + 
  scale_fill_viridis_c()

# Precipitation
ggplot_calendar_heatmap(
  df_w,
  'Date',
  'precipitation'
) + theme_fivethirtyeight() + 
  theme(legend.position = "right",
        legend.direction = "vertical") + 
  scale_fill_viridis_c()

자전거를 많이 빌리는 날에도 강수량이 높은 날에는 많이 안 타는 것으로 보인다.

Weather

df <- left_join(df_w, df.cal)
## Joining, by = "Date"
df$month <- month(df$Date,label = T,abbr = F)
ggplot(df, aes(x=temperature, y=Rides)) + geom_point() +
  geom_smooth(se=F) +
  theme_fivethirtyeight() +
  ggtitle("2015 Rides")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(df %>% filter(precipitation>0), 
       aes(x=precipitation, y=Rides)) + geom_point() +
  theme_fivethirtyeight() +
  ggtitle("2015 Rides")

7, 8, 9월 더 자세히

df_many <- df %>% filter(month %in% c("7월", "8월", "9월"))
gg1 <- ggplot_calendar_heatmap(
  df_many,
  'Date',
  'Rides'
) + theme_fivethirtyeight() + 
  theme(legend.position = "right",
        legend.direction = "vertical") + 
  scale_fill_viridis_c()

# Temperature  
gg2<-ggplot_calendar_heatmap(
  df_many,
  'Date',
  'temperature'
) + theme_fivethirtyeight() + 
  theme(legend.position = "right",
        legend.direction = "vertical") + 
  scale_fill_viridis_c()

# Precipitation
gg3 <- ggplot_calendar_heatmap(
  df_many,
  'Date',
  'precipitation'
) + theme_fivethirtyeight() + 
  theme(legend.position = "right",
        legend.direction = "vertical") + 
  scale_fill_viridis_c()

gg1;gg2;gg3

ggplot(data=df_many) +
  geom_point(aes(x=precipitation, y=Rides)) +
  facet_grid(.~month) +
  theme_fivethirtyeight() + 
  ggtitle("2015 Rides Frequency")
## Warning: Removed 1 rows containing missing values (geom_point).

제일 많이 타는 7, 8, 9월만 따로 보았다. 가장 두드러지는 점은 강수량의 Heatmap과 Rides의 Heatmap의 밝은부분과 어두운 부분이 반대로 나타난다는 것이다. 전반적으로 강수량이 증가하면 자전거를 타는 횟수도 줄어듦을 알 수 있다.